
/******************************************************************************
 *
 *  This file is part of meryl, a genomic k-kmer counter with nice features.
 *
 *  This software is based on:
 *    'Canu' v2.0              (https://github.com/marbl/canu)
 *  which is based on:
 *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
 *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
 *
 *  Except as indicated otherwise, this is a 'United States Government Work',
 *  and is released in the public domain.
 *
 *  File 'README.licenses' in the root directory of this distribution
 *  contains full conditions and disclaimers.
 */

#include "meryl-lookup.H"

#include "system.H"



class filterInput {
public:
  filterInput() {
  };
  ~filterInput() {
  };

  dnaSeq        seq1;
  dnaSeq        seq2;

  uint64        nTotal;
  uint64        nFound;
};




static
void *
loadSequence(void *G) {
  lookupGlobal *g = (lookupGlobal *)G;
  filterInput  *s = new filterInput;

  bool   load1 = (g->seqFile1 != nullptr) && (g->seqFile1->loadSequence(s->seq1) == true);
  bool   load2 = (g->seqFile2 != nullptr) && (g->seqFile2->loadSequence(s->seq2) == true);

  if ((load1 == false) &&
      (load2 == false)) {
    delete s;
    return(nullptr);
  }

  return(s);
}



static
uint64
processSequence(merylExactLookup *L, dnaSeq &seq, bool is10x) {
  kmerIterator kiter(seq.bases(), seq.length());
  uint64       found = 0;

  //  Ignore the first 23 kmers in seq

  while (kiter.nextMer()) {
    if (is10x && kiter.bgnPosition() < 23)  continue;
    if ((L->value(kiter.fmer()) > 0) ||
        (L->value(kiter.rmer()) > 0))
      found++;
  }

  return(found);
}


static
void
processSequence(void *G, void *T, void *S) {
  lookupGlobal *g = (lookupGlobal *)G;
  filterInput  *s = (filterInput  *)S;

  //  Count the number of kmers found in the database from either
  //  seq1 or seq2.
  //
  //  If this is 10X Genomics reads, ignore counting in the first 23 bp of the seq1.

  s->nFound  = processSequence(g->lookupDBs[0], s->seq1, g->is10x);
  s->nFound += processSequence(g->lookupDBs[0], s->seq2, false);
}



static
void
outputSequence(compressedFileWriter  *O,
               dnaSeq                &seq,
               uint64                 nFound) {

  if (O == nullptr)
    return;

  if (seq.quals()[0] == 0)   outputFASTA(O->file(), seq.bases(),              seq.length(), 0, "%s nKmers=%lu", seq.ident(), nFound);
  else                       outputFASTQ(O->file(), seq.bases(), seq.quals(), seq.length(),    "%s nKmers=%lu", seq.ident(), nFound);
}


static
void
outputSequence(void *G, void *S) {
  lookupGlobal *g = (lookupGlobal *)G;
  filterInput  *s = (filterInput  *)S;

  g->nReadsTotal++;

  //  Write output if:
  //    'include' and    mers found.
  //    'exclude' and no mers found.

  if (((s->nFound  > 0) && (g->reportType == lookupOp::opInclude)) ||
      ((s->nFound == 0) && (g->reportType == lookupOp::opExclude))) {
    g->nReadsFound++;

    outputSequence(g->outFile1, s->seq1, s->nFound);
    outputSequence(g->outFile2, s->seq2, s->nFound);
  }

  delete s;
}



void
filter(lookupGlobal *g) {

  if (g->is10x)
    fprintf(stderr, "\nRunning in 10x mode. The first 23 bp of every sequence in %s will be ignored while looking up.\n", g->seqFile1->filename());
  
  sweatShop     *ss = new sweatShop(loadSequence, processSequence, outputSequence);

  ss->setLoaderQueueSize(10240);
  ss->setNumberOfWorkers(omp_get_max_threads());
  ss->setWriterQueueSize(omp_get_max_threads());

  ss->run(g, g->showProgress);

  delete ss;

  fprintf(stderr, "\nIncluding %lu reads (or read pairs) out of %lu.\n", g->nReadsTotal, g->nReadsFound);
}

